# load data and fit models from last time wells <- read.csv("ecol 563/wells.txt") out1 <- glm(cbind(y, n-y)~ land.use + sewer, data=wells, family=binomial) wells$land.use2 <- factor(ifelse(wells$land.use %in% c('agri','undev'), 'rural', as.character(wells$land.use))) wells$land.use3 <- factor(ifelse(wells$land.use2 %in% c('inst','recr','resL','resM','trans'), 'mixed', as.character(wells$land.use2))) wells$land.use4 <- factor(ifelse(wells$land.use3 %in% c('resH','comm','indus'), 'high.use', as.character(wells$land.use3))) out2d <- glm(cbind(y,n-y)~land.use4+sewer, data=wells, family=binomial) # predicted logits predict(out2d) # predicted probabilities predict(out2d, type='response') # predicted means predict(out2d, type='response')*wells$n ei <- predict(out2d, type='response')*wells$n # expected number of contaminated and uncontaminated wells Ei <- cbind(ei, wells$n-ei) # observed number of contaminated and uncontaminated wells Oi <- cbind(wells$y, wells$n-wells$y) # Pearson statistic sum((Oi-Ei)^2/Ei) # p-value of Pearson statistic 1-pchisq(sum((Oi-Ei)^2/Ei),16) # G-test statistic sum(ifelse(Oi==0,0,2*Oi*log(Oi/Ei))) # residual deviance = G-test statistic deviance(out2d) # degrees of freedom for G-test df.residual(out2d) # deviance goodness of fit 1-pchisq(deviance(out2d), df.residual(out2d)) # 35% of predicted counts are less than 5 sum(Ei<=5)/length(Ei) # function for parametric bootstrap sim.func <- function() { obs <- sapply(1:nrow(wells), function(x) rbinom(1, prob=fitted(out2d)[x], size=wells$n[x])) my.glm <- glm(cbind(obs,n-obs)~land.use4 + sewer, data=wells, family=binomial) ei <- fitted(my.glm)*wells$n Ei <- cbind(ei, wells$n-ei) Oi <- cbind(obs, wells$n-obs) # pearson test pearson <- sum((Oi-Ei)^2/Ei) # G-test gtest <- sum(ifelse(Oi==0,0,2*Oi*log(Oi/Ei))) # residual deviance dev <- deviance(my.glm) # return residual deviance dev } # obtain 999 residual deviances from model sims <- replicate(999, sim.func()) # append actual residual deviance to make 1000 sims <- c(sims, deviance(out2d)) # proportion of simulated values that exceed actual value sum(sims>=deviance(out2d))/length(sims) # if ratio of residual deviance to df > 1, overdispersion? deviance(out2d)/df.residual(out2d) sum((Oi-Ei)^2/Ei) # if ratio of Pearson deviance to df > 1, overdispersion? sum((Oi-Ei)^2/Ei)/df.residual(out2d) # if overdispersion is real and cannot be fixed we can fit quasi-binomial # quasi-binomial model out2d.quasi <- glm(cbind(y,n-y)~land.use4+sewer, data=wells, family=quasibinomial) summary(out2d.quasi) # same estimates as binomial but different standard errors summary(out2d.quasi)$coefficients summary(out2d)$coefficients # the standard errors are obtained from the Pearson deviance phi <- sum((Oi-Ei)^2/Ei)/df.residual(out2d) phi sqrt(phi)*summary(out2d)$coefficients[,2] # add continuous predictors to the logistic regression model out2g <- update(out2d, .~.+nitrate) out2h <- update(out2d, .~.+chloride) out2i <- update(out2d, .~.+chloride+nitrate) AIC(out2g, out2h, out2i, out2d) anova(out2d, out2g, test='Chisq') # is there a significant interaction? out2j<-update(out2d, .~. +land.use4:sewer) anova(out2d,out2j, test='Chisq') # obtaining odds ratios # OR for contamination: mixed versus high use exp(coef(out2d)[2]) # OR for contamination: high use versus mixed exp(-coef(out2d)[2]) # OR for contamination: sewers present versus sewers absent exp(coef(out2d)[4]) # OR for contamination: mixed versus rural exp(coef(out2d)[2]-coef(out2d)[3]) # OR for contamination for a one unit change in nitrate exp(coef(out2g)[5]) # OR for contamination for a three unit change in nitrate exp(3*coef(out2g)[5]) # confidence intervals for the regression parameters confint(out2d) out.ci <- confint(out2d) # 95% confidence intervals for the odds ratios # mixed versus high and rural versus high exp(out.ci[2,]) # OR for contamination: high use versus mixed exp(-coef(out2d)[2]) # 95% confidence interval for that odds ratio exp(-out.ci[2,]) # OR for contamination: high use versus rural exp(-coef(out2d)[3]) # 95% confidence interval for that odds ratio exp(-out.ci[3,]) # log odds ratio for contamination: mixed versus rural est <- c(0,1,-1,0)%*%coef(out2d) # standard error of that log odds ratio se <- sqrt(c(0,1,-1,0)%*%vcov(out2d)%*%c(0,1,-1,0)) # 95% confidence interval for the log odds ratio est + c(qnorm(.025),qnorm(.975))*se # 95% confidence interval for the odds ratio: mixed versus rural exp(est + c(qnorm(.025),qnorm(.975))*se) # point estimate exp(est)
Created by Pretty R at inside-R.org